29  Canonical Correlation - Practical

29.1 Canonical Correlation: Demonstration

Step One: Create synthetic dataset

First, I’ll create a ‘dummy’ dataset that we can use to explore the CCA technique.

Show code
# clean environment
rm(list = ls())

# Load necessary library
library(MASS)  # For generating correlated data

# Create a synthetic dataset
create_dataset <- function(n = 100, means, Sigma) {
    data <- MASS::mvrnorm(n = n, mu = means, Sigma = Sigma)
    colnames(data) <- c("DrivingAccuracy", "PuttingAccuracy", "AverageScore", "EaglesPerRound", 
                        "WindSpeed", "Rainfall", "Temperature", "CourseDifficulty")
    return(as.data.frame(data))
}

# Define means and a covariance matrix for the variables
means <- c(60, 50, 72, 0.5, 10, 5, 70, 7)
Sigma <- matrix(c(
    1, 0.2, 0, 0, -0.6, -0.7, 0, -0.1,
    0.2, 1, 0, 0, 0, 0, 0, 0,
    0, 0, 1, 0, 0, 0, 0, -0.5,
    0, 0, 0, 1, 0, 0, 0, 0,
    -0.6, 0, 0, 0, 1, 0, 0, 0,
    -0.7, 0, 0, 0, 0, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 1, 0,
    -0.1, 0, -0.5, 0, 0, 0, 0, 1
), ncol = 8)

# Generate the dataset
set.seed(123) # For reproducibility
golf_dataset <- create_dataset(n = 100, means, Sigma)


# Round all columns in the dataframe to 2 decimal places
golf_dataset <- data.frame(lapply(golf_dataset, function(x) {
    if(is.numeric(x)) round(x, 2) else x
}))

rm(Sigma)

Step Two: Examine the correlations

Remember that CCA is about the correlations that exist between the variables in our two sets of variables. Before digging too deep, it’s helpful to explore where those underlying correlations might exist.

# Load packages
library(ggplot2)
library(corrplot)
corrplot 0.92 loaded
cor_matrix <- cor(golf_dataset) # create the correlation matrix
cov_matrix <- cov(golf_dataset) # create the covariance matrix

# Visualise  correlation matrix
corrplot(cor_matrix, method = "number")

# more visualisations of the variables

# Example 1

# Scatterplot for Putting Accuracy vs Rainfall
ggplot(golf_dataset, aes(x = Rainfall, y = PuttingAccuracy)) +
  geom_point() +
    geom_smooth(method = "lm", color = "blue", se = FALSE) +
  labs(title = "Scatterplot of Putting Accuracy vs Rainfall",
       x = "Rainfall (mm)",
       y = "Putting Accuracy (%)") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

## Example 2

# Categorise Wind Speed into 'Low', 'Medium', and 'High'
golf_dataset$WindSpeedCategory <- cut(golf_dataset$WindSpeed, 
                                      breaks = quantile(golf_dataset$WindSpeed, probs = 0:3/3),
                                      labels = c("Low", "Medium", "High"), 
                                      include.lowest = TRUE)

# Nested Scatterplot
ggplot(golf_dataset, aes(x = DrivingAccuracy, y = AverageScore)) +
  geom_point() +
    geom_smooth(method = "lm", color = "blue", se = FALSE) +
  facet_wrap(~ WindSpeedCategory, scales = "free") +
  labs(title = "Nested Scatterplot of Driving and Average Score by Wind Speed",
       x = "Driving Accuracy (%)",
       y = "Average Score (%)",
       caption = "Wind Speed Categories: Low, Medium, High") +
  theme_minimal()
`geom_smooth()` using formula = 'y ~ x'

Step Three: Load libraries for CCA

I need four libraries for further CCA.

Show code
library(ggplot2)
library(GGally)
library(CCA)
library(CCP)

Step Four: Create variable sets

I must split the dataset into two parts; one contains the performance-related variables, and one contains the condition-related variables. Remember, we’re trying to explore how these two sets are associated.

performance <- golf_dataset[,1:4]
conditions <- golf_dataset[,5:8]

Step Five: Calculate correlation matrix

I use the matcor function to examine the correlations between and within the two sets of variables.

matcor(performance, conditions)
$Xcor
                DrivingAccuracy PuttingAccuracy AverageScore EaglesPerRound
DrivingAccuracy      1.00000000      0.24446707  -0.02859946     0.08332380
PuttingAccuracy      0.24446707      1.00000000  -0.13029905    -0.02167584
AverageScore        -0.02859946     -0.13029905   1.00000000    -0.03680759
EaglesPerRound       0.08332380     -0.02167584  -0.03680759     1.00000000

$Ycor
                   WindSpeed    Rainfall Temperature CourseDifficulty
WindSpeed         1.00000000 -0.08119233 0.034087943      0.049410258
Rainfall         -0.08119233  1.00000000 0.092936666     -0.155493878
Temperature       0.03408794  0.09293667 1.000000000      0.005854577
CourseDifficulty  0.04941026 -0.15549388 0.005854577      1.000000000

$XYcor
                 DrivingAccuracy PuttingAccuracy AverageScore EaglesPerRound
DrivingAccuracy       1.00000000      0.24446707  -0.02859946    0.083323800
PuttingAccuracy       0.24446707      1.00000000  -0.13029905   -0.021675839
AverageScore         -0.02859946     -0.13029905   1.00000000   -0.036807586
EaglesPerRound        0.08332380     -0.02167584  -0.03680759    1.000000000
WindSpeed            -0.70015145     -0.21054255  -0.09836886   -0.003583522
Rainfall             -0.54019157      0.12177734   0.08711485   -0.094905910
Temperature          -0.01870483      0.07397267   0.07251805    0.032765984
CourseDifficulty     -0.02663877      0.06502431  -0.45005940    0.046373353
                    WindSpeed    Rainfall  Temperature CourseDifficulty
DrivingAccuracy  -0.700151446 -0.54019157 -0.018704827     -0.026638766
PuttingAccuracy  -0.210542546  0.12177734  0.073972668      0.065024305
AverageScore     -0.098368857  0.08711485  0.072518051     -0.450059396
EaglesPerRound   -0.003583522 -0.09490591  0.032765984      0.046373353
WindSpeed         1.000000000 -0.08119233  0.034087943      0.049410258
Rainfall         -0.081192333  1.00000000  0.092936666     -0.155493878
Temperature       0.034087943  0.09293667  1.000000000      0.005854577
CourseDifficulty  0.049410258 -0.15549388  0.005854577      1.000000000

Step Six: Display Canonical Correlations

Most importantly, I can display the canonical correlations. these tell me the overall associations between the two sets of variables. The higher the number in relative terms, the stronger the relationship.

We will have the same number of canonical correlations as the smallest number of variables in either set.

cc1 <- cc(performance, conditions) 

# display the canonical correlations
cc1$cor
[1] 0.94379531 0.45876017 0.25324247 0.05955601

Step Seven: Display Raw Canonical Coefficients

These help us explore the association between the sets further. They tells us strongly each variable contributed to each of the relationships represented by our canonical correlation.

# print the raw canonical coefficients
cc1[3:4]
$xcoef
                       [,1]        [,2]         [,3]        [,4]
DrivingAccuracy -1.08227539  0.11331569 -0.079561322 -0.09982441
PuttingAccuracy  0.16242042 -0.13146219  1.048793017  0.18440570
AverageScore    -0.11465183 -1.05327755  0.005989217  0.12548457
EaglesPerRound   0.01870145  0.07786462 -0.123061561  1.03675084

$ycoef
                        [,1]        [,2]       [,3]        [,4]
WindSpeed         0.79373152  0.05151688 -0.6066692  0.17292367
Rainfall          0.79795176 -0.23216641  0.7836104 -0.30244988
Temperature      -0.06446521 -0.15924958  0.2321903  0.92373177
CourseDifficulty  0.15982620  0.92216313  0.3655098  0.05810928

Step Eight: Display Canonical Loading

A similar concept can be found in canonical loading, which are the correlations between the variables and the canonical variates. Remember, these variates are a type of latent variable.

# compute canonical loadings
cc2 <- comput(performance, conditions, cc1)

# display canonical loadings
cc2[3:6]
$corr.X.xscores
                       [,1]        [,2]       [,3]       [,4]
DrivingAccuracy -0.98032302  0.11101102  0.1607338 0.02842514
PuttingAccuracy -0.08054709  0.02830678  0.9894816 0.11677772
AverageScore    -0.10017785 -0.98590608 -0.1189206 0.06173722
EaglesPerRound  -0.06667113  0.12309177 -0.1464270 0.97926631

$corr.Y.xscores
                       [,1]        [,2]        [,3]         [,4]
WindSpeed        0.69406502  0.04931556 -0.15929457  0.013576706
Rainfall         0.56044029 -0.16712324  0.17484452 -0.011607942
Temperature      0.02338214 -0.08114820  0.07236623  0.056072534
CourseDifficulty 0.08703480  0.44090779  0.05939162  0.006740227

$corr.X.yscores
                       [,1]        [,2]        [,3]        [,4]
DrivingAccuracy -0.92522427  0.05092743  0.04070462 0.001692888
PuttingAccuracy -0.07601996  0.01298602  0.25057877 0.006954815
AverageScore    -0.09454739 -0.45229444 -0.03011576 0.003676822
EaglesPerRound  -0.06292390  0.05646960 -0.03708154 0.058321196

$corr.Y.yscores
                       [,1]       [,2]       [,3]       [,4]
WindSpeed        0.73539783  0.1074975 -0.6290200  0.2279653
Rainfall         0.59381551 -0.3642933  0.6904234 -0.1949080
Temperature      0.02477459 -0.1768859  0.2857587  0.9415092
CourseDifficulty 0.09221788  0.9610856  0.2345247  0.1131746

Step Nine: Test the Canonical Dimensions

Finally, we can test what we’ve found. We look to see the significance of each of the canonical dimensions identified earlier and, if p < 0.05, we can say that there is a significant association overall between the two sets of variables.

# tests of canonical dimensions
rho <- cc1$cor
## Define number of observations, number of variables in first set, and number of variables in the second set.
n <- dim(conditions)[1]
p <- length(conditions)
q <- length(performance)

## Calculate p-values using the F-approximations of different test statistics:
p.asym(rho, n, p, q, tstat = "Wilks")
Wilks' Lambda, using F-approximation (Rao's F):
              stat     approx df1      df2      p.value
1 to 4:  0.0804393 22.5671350  16 281.7023 0.0000000000
2 to 4:  0.7362838  3.3732631   9 226.4882 0.0006674902
3 to 4:  0.9325488  1.6700815   4 188.0000 0.1586524218
4 to 4:  0.9964531  0.3381567   1  95.0000 0.5622724374

Now I’ll do the same but using standardised conditions:

# standardised conditions canonical coefficients diagonal matrix of conditions sd's

s1 <- diag(sqrt(diag(cov(conditions))))
s1 %*% cc1$xcoef
            [,1]        [,2]        [,3]        [,4]
[1,] -1.07126911  0.11216332 -0.07875222 -0.09880924
[2,]  0.14021274 -0.11348741  0.90539195  0.15919197
[3,] -0.11910341 -1.09417309  0.00622176  0.13035676
[4,]  0.01882733  0.07838872 -0.12388987  1.04372908
# standardised performance canonical coefficients diagonal matrix of performance sd's

s2 <- diag(sqrt(diag(cov(performance))))
s2 %*% cc1$ycoef
            [,1]        [,2]       [,3]        [,4]
[1,]  0.75026631  0.04869579 -0.5734477  0.16345426
[2,]  0.76542714 -0.22270328  0.7516703 -0.29012198
[3,] -0.06099155 -0.15066853  0.2196789  0.87395716
[4,]  0.15344115  0.88532277  0.3509077  0.05578782

29.2 Canonical Correlation Analysis: Practice

First, run the following code in RStudio which will create a synthetic dataset called health_dataset.

rm(list = ls())

# Load necessary library
library(MASS)  # For generating correlated data

# Function to create a synthetic dataset
create_dataset <- function(n = 100, means, Sigma) {
    data <- MASS::mvrnorm(n = n, mu = means, Sigma = Sigma)
    colnames(data) <- c("CalorieIntake", "ProteinContent", "VitaminLevels", "WaterIntake",
                        "StressLevel", "MoodRating", "SleepQuality", "ConcentrationLevel")
    return(as.data.frame(data))
}

# Define means and a covariance matrix for the variables
# These are example values and can be adjusted
means <- c(2000, 50, 100, 2, 5, 7, 7, 7)  # Example mean values for each variable
Sigma <- matrix(c(
    1, 0.5, 0, 0, 0, 0.3, 0.3, 0,
    0.5, 1, 0, 0, 0, 0.2, 0.4, 0,
    0, 0, 1, 0, -0.2, 0.2, 0, 0,
    0, 0, 0, 1, -0.3, 0.1, 0.2, 0.1,
    0, 0, -0.2, -0.3, 1, -0.4, -0.5, -0.3,
    0.3, 0.2, 0.2, 0.1, -0.4, 1, 0.6, 0.4,
    0.3, 0.4, 0, 0.2, -0.5, 0.6, 1, 0.5,
    0, 0, 0, 0.1, -0.3, 0.4, 0.5, 1
), ncol = 8)

# Generate the dataset
set.seed(123) # For reproducibility
health_dataset <- create_dataset(n = 500, means, Sigma)

# Calculate  correlation and covariance matrix

cor_matrix <- cor(health_dataset) # create the correlation matrix
cov_matrix <- cov(health_dataset) # create the covariance matrix

Now, examine the correlations between the variables. Use any of the techniques you have previously learned about.

# Load packages
library(ggplot2)
library(corrplot)

cor_matrix <- cor(health_dataset) # create the correlation matrix
cov_matrix <- cov(health_dataset) # create the covariance matrix

# Visualise  correlation matrix
corrplot(cor_matrix, method = "number")
# more visualisations of the variables

# Example 1

# Scatterplot of Calorie Intake and Sleep Quality
ggplot(health_dataset, aes(x = CalorieIntake, y = SleepQuality)) +
  geom_point() +
    geom_smooth(method = "lm", color = "blue", se = FALSE) +
  labs(title = "Scatterplot of Calorie Intake and Sleep Quality",
       x = "Intake (cals)",
       y = "Sleep Quality (low - high)") +
  theme_bw()
## Example 2

# Categorise Mood Rating into 'Low', 'Medium', and 'High'
health_dataset$MoodRatingCat <- cut(health_dataset$MoodRating, 
                                      breaks = quantile(health_dataset$MoodRating, probs = 0:3/3),
                                      labels = c("Low", "Medium", "High"), 
                                      include.lowest = TRUE)

# Nested Scatterplot
ggplot(health_dataset, aes(x = WaterIntake, y = ConcentrationLevel)) +
  geom_point() +
    geom_smooth(method = "lm", color = "blue", se = FALSE) +
  facet_wrap(~ MoodRatingCat, scales = "free") +
  labs(title = "Nested Scatterplot of Water Intake and Concentration Level by Mood Rating",
       x = "Water Intake (l)",
       y = "Concentration Level (low - high)",
       caption = "Mood Categories: Low, Medium, High") +
  theme_minimal()

Continue to apply the steps listed above to this new dataset. Examine the overall canonical coefficients between the two sets of variables to see if they are closely associated, and examine which variables contribute to each of the canonical correlations.

Show solution
library(ggplot2)
library(GGally)
library(CCA)
library(CCP)


nutrition <- health_dataset[,1:4]
health <- health_dataset[,5:8]

matcor(health, nutrition)
$Xcor
                   StressLevel MoodRating SleepQuality ConcentrationLevel
StressLevel          1.0000000 -0.3369012   -0.4442551         -0.2897950
MoodRating          -0.3369012  1.0000000    0.6313190          0.3893236
SleepQuality        -0.4442551  0.6313190    1.0000000          0.5163001
ConcentrationLevel  -0.2897950  0.3893236    0.5163001          1.0000000

$Ycor
               CalorieIntake ProteinContent VitaminLevels WaterIntake
CalorieIntake     1.00000000      0.4820434   -0.01903186 -0.08273793
ProteinContent    0.48204339      1.0000000    0.05255840 -0.12470723
VitaminLevels    -0.01903186      0.0525584    1.00000000  0.09400831
WaterIntake      -0.08273793     -0.1247072    0.09400831  1.00000000

$XYcor
                   StressLevel  MoodRating SleepQuality ConcentrationLevel
StressLevel         1.00000000 -0.33690116  -0.44425507        -0.28979498
MoodRating         -0.33690116  1.00000000   0.63131898         0.38932362
SleepQuality       -0.44425507  0.63131898   1.00000000         0.51630012
ConcentrationLevel -0.28979498  0.38932362   0.51630012         1.00000000
CalorieIntake       0.05910655  0.30558553   0.31866607        -0.01400816
ProteinContent     -0.01679902  0.24243604   0.42896787         0.03826109
VitaminLevels      -0.17798550  0.10208330  -0.04415736        -0.11449625
WaterIntake        -0.36191297  0.09155184   0.12017460         0.04574796
                   CalorieIntake ProteinContent VitaminLevels WaterIntake
StressLevel           0.05910655    -0.01679902   -0.17798550 -0.36191297
MoodRating            0.30558553     0.24243604    0.10208330  0.09155184
SleepQuality          0.31866607     0.42896787   -0.04415736  0.12017460
ConcentrationLevel   -0.01400816     0.03826109   -0.11449625  0.04574796
CalorieIntake         1.00000000     0.48204339   -0.01903186 -0.08273793
ProteinContent        0.48204339     1.00000000    0.05255840 -0.12470723
VitaminLevels        -0.01903186     0.05255840    1.00000000  0.09400831
WaterIntake          -0.08273793    -0.12470723    0.09400831  1.00000000
Show solution
cc1 <- cc(health, nutrition) 

# display the canonical correlations
cc1$cor
[1] 0.56465384 0.41843772 0.28046195 0.05744737
Show solution
# print the raw canonical coefficients
cc1[3:4]
$xcoef
                         [,1]        [,2]       [,3]        [,4]
StressLevel        -0.4971167 -1.07173935 -0.2629037  0.04508267
MoodRating         -0.1147914  0.25316327 -1.1728372  0.59898131
SleepQuality       -1.1761224 -0.08892745  0.6955433 -0.45872628
ConcentrationLevel  0.4317114 -0.36619812  0.4509505  0.88960757

$ycoef
                      [,1]        [,2]       [,3]       [,4]
CalorieIntake  -0.45332153 -0.00395035 -0.8067794  0.6848746
ProteinContent -0.69438612  0.13102271  0.6286997 -0.6355298
VitaminLevels   0.16299989  0.50993146 -0.6403170 -0.5617179
WaterIntake    -0.07738892  0.81074724  0.3907667  0.4494141
Show solution
# compute canonical loadings
cc2 <- comput(health, nutrition, cc1)

# display canonical loadings
cc2[3:6]
$corr.X.xscores
                          [,1]        [,2]       [,3]       [,4]
StressLevel        -0.02790634 -0.92740648 -0.3062283 -0.2129852
MoodRating         -0.52767681  0.37655176 -0.4294424  0.6287648
SleepQuality       -0.81592845  0.31257772  0.3298658  0.3574136
ConcentrationLevel -0.07574421 -0.03851795  0.4524690  0.8877223

$corr.Y.xscores
                      [,1]         [,2]        [,3]        [,4]
CalorieIntake  -0.44309270 -0.007216945 -0.14452641  0.01976505
ProteinContent -0.50805428  0.023305281  0.04654792 -0.02296315
VitaminLevels   0.07210025  0.248708578 -0.15591026 -0.03256735
WaterIntake     0.03517414  0.354045028  0.08954634  0.02425571

$corr.X.yscores
                          [,1]        [,2]        [,3]        [,4]
StressLevel        -0.01575742 -0.38806185 -0.08588539 -0.01223544
MoodRating         -0.29795474  0.15756346 -0.12044226  0.03612088
SleepQuality       -0.46071714  0.13079431  0.09251480  0.02053247
ConcentrationLevel -0.04276926 -0.01611736  0.12690035  0.05099731

$corr.Y.yscores
                      [,1]        [,2]       [,3]       [,4]
CalorieIntake  -0.78471564 -0.01724736 -0.5153156  0.3440549
ProteinContent -0.89976236  0.05569594  0.1659687 -0.3997249
VitaminLevels   0.12768929  0.59437419 -0.5559052 -0.5669075
WaterIntake     0.06229328  0.84611165  0.3192816  0.4222249
Show solution
# tests of canonical dimensions
rho <- cc1$cor
## Define number of observations, number of variables in first set, and number of variables in the second set.
n <- dim(nutrition)[1]
p <- length(nutrition)
q <- length(health)

## Calculate p-values using the F-approximations of different test statistics:
p.asym(rho, n, p, q, tstat = "Wilks")
Wilks' Lambda, using F-approximation (Rao's F):
              stat    approx df1      df2      p.value
1 to 4:  0.5159936 22.727041  16 1503.722 0.000000e+00
2 to 4:  0.7575151 16.116372   9 1199.983 0.000000e+00
3 to 4:  0.9183005 10.753485   4  988.000 1.555599e-08
4 to 4:  0.9966998  1.639008   1  495.000 2.010612e-01
Show solution
# standardised nutrition canonical coefficients diagonal matrix of nutrition sd's

s1 <- diag(sqrt(diag(cov(nutrition))))
s1 %*% cc1$xcoef
           [,1]        [,2]       [,3]        [,4]
[1,] -0.4935742 -1.06410223 -0.2610303  0.04476141
[2,] -0.1158969  0.25560131 -1.1841320  0.60474970
[3,] -1.1781239 -0.08907878  0.6967269 -0.45950693
[4,]  0.4335844 -0.36778684  0.4529069  0.89346705
Show solution
# standardised health canonical coefficients diagonal matrix of health sd's

s2 <- diag(sqrt(diag(cov(health))))
s2 %*% cc1$ycoef
            [,1]         [,2]       [,3]       [,4]
[1,] -0.42024125 -0.003662081 -0.7479062  0.6348972
[2,] -0.67013641  0.126447063  0.6067439 -0.6133355
[3,]  0.16343858  0.511303860 -0.6420403 -0.5632297
[4,] -0.07935932  0.831389641  0.4007160  0.4608566